*Program name:      construct_horserace_data.do
*Written by:        Diane Alexander, Ezra Karger
*Date written:      April 19, 2020
*Last modified by:  Ezra Karger
*Date modified:     June 11, 2020

clear all


*Set sub-directory paths--same for all users:

global unacast  "${machine}/data/clean/unacast"
global stay_at_home "${machine}/data/clean/stay_at_home"
global womply "${machine}/data/clean/womply"
global nyt_cases "${machine}/data/clean/nyt_cases"
global census "${machine}/data/clean/census"
global horserace "${machine}/data/clean/horserace"
global trump "${machine}/data/clean/trump"
global acs "${machine}/data/clean/acs_2018"
global usda "${machine}/data/clean/usda_ers"
global tables "${machine}/tables/event_time"

global figures "${machine}/figures/event_time"

*Read in Diane's coded stay-at-home orders, and replace effective date with
*	one day later if the effective hour of the order was after 4pm?

use "${stay_at_home}/stay_at_home_AM.dta"
tab effective_hour, m
*replace effective_date=effective_date+1 if hh(effective_hour)>=16
*drop stay_at_home
*rename stay_at_home_a  stay_at_home

**Merge on Unacast data

merge 1:1 county_fips date using "${unacast}/social_distance_county_04_29.dta"
drop _merge

rename daily_distance_diff distance
rename daily_visitation_diff visit

**drop dates after April 17 (10 days after last order on April 7)
drop if date> 22022
 

**Merge on Womply data, but only after making Womply's data wide by category
*	of business. The categories include arts and entertainment (like movie theaters),
*	 bars, restaurants, and auto service shops.
*Final Womply data is at the date*county level, with variables for revenue from
*	each business type.
*For this analysis, we combine all businesses into two categories: food-related
*	establishments and other establishments

preserve
use "${womply}/womply_daily_revenue", clear
keep if year(date)==2020
keep if !missing(county_fips)

tab category, m

gen clean_cat="other"
replace clean_cat="food" if inlist(category,"bars and lounges","food and beverage shops","quick serve food and beverage businesses","restaurants")
tab category clean_cat, m


keep date county_fips revenue clean_cat intra_covid_active_merchants intra_covid_inactive_merchants merchants
rename revenue revenue_
rename intra_covid_active_merchants active_
rename intra_covid_inactive_merchants inactive_
rename merchants merchants_

collapse (sum) revenue_ active_ inactive_ merchants_, by(date county clean_cat)

reshape wide revenue_ active_ inactive_ merchants_, i(date county_fips) j(clean_cat) string

*Make 'all revenue' category
egen revenue_all = rowtotal(revenue_*)
egen inactive_all = rowtotal(inactive_*)
egen active_all = rowtotal(active_*)
egen merchants_all=rowtotal(merchants_*)
gen fr_i_all=inactive_all/(merchants_all)
gen fr_i_food=inactive_food/(merchants_food)
gen fr_i_other=inactive_other/(merchants_other)


*Missing observations in a date*county*category are instances with zero
*	revenue for that category of business.

foreach var of varlist revenue* {
	replace `var'=0 if missing(`var')
	
	gen log`var'=log(`var'+1)
}

**drop dates after April 17 (10 days after last order on April 7)
drop if date> 22022

tempfile womply
save "`womply'"
restore

merge 1:1 date county_fips using "`womply'", nogen


*Merge on USDA county-level characteristics
**CHECK THIS MERGE
merge m:1 county_fips using "${usda}/county_information.dta"
reg tot_pop pop_estimate_2018
*drop puerto rico
drop if county_fips>57000

*Create revenue per-capita estimates from the Womply panel

gen firstweekmarch=1 if date>=21975 & date<=21981

tab date if firstweekmarch==1


foreach v in food all other {
display "`var'"

	gen temp_estab_`v'=(merchants_`v') if firstweekmarch==1
	bysort county_fips: egen estab_earlymarch_`v'=mean(temp_estab_`v')
	gen perest_revenue_`v'=revenue_`v'/(estab_earlymarch_`v')
	drop temp_estab_`v'
}

rename perest_revenue_food pe_r_food
rename perest_revenue_other pe_r_other
rename perest_revenue_all pe_r_all

gen log_pe_food = log(pe_r_food+1)
gen log_pe_other = log(pe_r_other+1)
gen log_pe_all = log(pe_r_all+1)

sort county_fips date
 
*Merge on Trump vote share

preserve
import delimited using "${trump}/2016_US_County_Level_Presidential_Results.csv", clear

rename combined_fips county_fips
format county_fips %05.0f
gen gop_win_pct = per_gop-per_dem
keep county_fips gop_win_pct

sum gop_win_pct, d

tempfile trump
save "`trump'"
restore

merge m:1 county_fips using "`trump'", nogen

gen split_trump=gop_win_pct>0 & !missing(gop_win_pct)
gen split_dem=gop_win_pct<=0 & !missing(gop_win_pct)




**This just pulls the effective date of the stay-at-home order through the whole time period
foreach x in effective_date {
	bysort county_fips: egen `x'_t=mean(`x')
	drop `x'
	rename `x'_t `x'
}
format effective_date %td


** 88 counties don't show up in the social distance data, but these counties have very small pop

bysort county_fips: egen ever_stayathome=max(stay_at_home)

gen weekend=1 if weekday==1 | weekday==7

drop state_fips
gen temp=county_fips
format temp %05.0f
tostring temp,  replace usedisplayformat
gen state_fips=substr(temp ,1,2)
destring state_fips, replace
drop temp state



**unacast data starts feb 24==21969, but NYTimes data starts in January
merge 1:1 county_fips date using "${nyt_cases}/nyt_cases_04_17.dta", nogen 

**missings in the cases/deaths are zeroes
replace cases=0 if cases==.
replace cases_daily_incr=0 if cases_daily_incr==.
replace deaths=0 if deaths==.
replace deaths_daily_incr=0 if deaths_daily_incr==.


*Construct county-level dataset, which contains all county-level covariates,
*	as well as:
*		1. change in each outcome from March 1, 2020 - April 11, 2020
*		2. change in outcome from t-10 to t-1 (pre-order)
*		3. change in outcome from t-1 to t+1 (immediate response)
*		4. change in outcome from t+1 to t+10 (post-order response)
*		5. change in outcome from t-10 to t+10 (around-order response)


*First, drop counties with no stay-at-home order:

drop if ever_stayathome==0


*Keep data from March 1 onward

drop if date<mdy(3,1,2020) | missing(effective_date)


*define the window for the eventtime graphs
local window 11
display `window'
local total (2*`window'+1)

**event time 0 is the day before effective date of order
gen eventtime=date-effective_date  +1

replace eventtime = -11 if eventtime<-11
replace eventtime = 11 if eventtime>11
tab eventtime, m

preserve


**event time 10 is the effective date of order
**shift event time variables because stata doesn't like negative categories

replace eventtime=eventtime+`window'
tab eventtime
**16 is in the middle

**omitted cat is -1
fvset base 11 eventtime 
duplicates report county_fips eventtime


reghdfe visit i.eventtime [aweight=pop_estimate_2018] , absorb(date county_fips) cluster(state_fips)
reghdfe distance i.eventtime [aweight=pop_estimate_2018] , absorb(date county_fips) cluster(state_fips)
reghdfe pe_r_food i.eventtime [aweight=pop_estimate_2018] , absorb(date county_fips) cluster(state_fips)
restore



sort county_fips date

local outcomes "distance visit pe_r_food pe_r_other pe_r_all cases deaths log_pe_food log_pe_other log_pe_all"

foreach o of local outcomes {

	gen t_val_march1 = `o' if date==mdy(3,1,2020)
	gen t_val_apr11 = `o' if date==mdy(4,11,2020)
	
	gen t_val_event_m10 = `o' if eventtime==-10
	gen t_val_event_m0 = `o' if eventtime==0
	gen t_val_event_p2 = `o' if eventtime==2
	gen t_val_event_p10 = `o' if eventtime==10
	
	foreach var of varlist t_val_* {
	
		by county: egen temp = min(`var')
		replace `var'=temp if missing(`var')
		drop temp
		
	}
	
	rename t_val_* `o'_val_*

}


*Subset to county-level variables

keep if !missing(date_announced)

keep state_fips county_fips date_announced effective_date gop_win_pct *_val_* ///
	metro_2013 civilian_labor_force_2018 employed_2018 unemployed_2018 ///
	unemployment_rate_2018 median_household_income_2018 pop_estimate_2018 ///
	births_2018 deaths_2018 gq_estimates_2018 pct_lt_highschool_2014_2018 ///
	pct_highschool_2014_2018 pct_somecollege_2014_2018 pct_college_plus_2014_2018 ///
	ruralurban_continuum_code_2013 pctpovall_2018

duplicates drop
duplicates report county_fips


*Save copy of the dataset
save "${horserace}/horserace_data.dta", replace

